In [1]:
 #Import packages
### YOUR CODE HERE ### 

# For data manipulation
import numpy as np
import pandas as pd

# For data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# For displaying all of the columns in dataframes
pd.set_option('display.max_columns', None)

# For data modeling
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from xgboost import plot_importance

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# For metrics and helpful functions
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score,\
f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.tree import plot_tree

# For saving models
import pickle
In [5]:
# RUN THIS CELL TO IMPORT YOUR DATA. 

# Load dataset into a dataframe
### YOUR CODE HERE ### 
df = pd.read_csv(r"C:\Users\HP\OneDrive\Documents\Google Advanced Data Analytics\Statistics Course\HR_comma_sep.csv")

# Display first few rows of the dataframe 
df.head()
Out[5]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years Department salary
0 0.38 0.53 2 157 3 0 1 0 sales low
1 0.80 0.86 5 262 6 0 1 0 sales medium
2 0.11 0.88 7 272 4 0 1 0 sales medium
3 0.72 0.87 5 223 5 0 1 0 sales low
4 0.37 0.52 2 159 3 0 1 0 sales low
In [6]:
# Gather basic information about the data
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   Department             14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB
In [7]:
# Gather descriptive statistics about the data
df.describe()
Out[7]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years
count 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000
mean 0.612834 0.716102 3.803054 201.050337 3.498233 0.144610 0.238083 0.021268
std 0.248631 0.171169 1.232592 49.943099 1.460136 0.351719 0.425924 0.144281
min 0.090000 0.360000 2.000000 96.000000 2.000000 0.000000 0.000000 0.000000
25% 0.440000 0.560000 3.000000 156.000000 3.000000 0.000000 0.000000 0.000000
50% 0.640000 0.720000 4.000000 200.000000 3.000000 0.000000 0.000000 0.000000
75% 0.820000 0.870000 5.000000 245.000000 4.000000 0.000000 0.000000 0.000000
max 1.000000 1.000000 7.000000 310.000000 10.000000 1.000000 1.000000 1.000000
In [ ]:
 
In [ ]:
 
In [8]:
# Display all column names
df.columns
Out[8]:
Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'Department', 'salary'],
      dtype='object')
In [8]:
# Rename columns as needed
df = df.rename(columns={'Work_accident': 'work_accident',
                          'average_montly_hours': 'average_monthly_hours',
                          'time_spend_company': 'tenure',
                          'Department': 'department'})

# Display all column names after the update 
df.columns
Out[8]:
Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_monthly_hours', 'tenure', 'work_accident', 'left',
       'promotion_last_5years', 'department', 'salary'],
      dtype='object')
In [ ]:
 
In [10]:
df.isna().sum()
Out[10]:
satisfaction_level       0
last_evaluation          0
number_project           0
average_monthly_hours    0
tenure                   0
work_accident            0
left                     0
promotion_last_5years    0
department               0
salary                   0
dtype: int64
In [ ]:
 
In [11]:
df.duplicated().sum()
Out[11]:
3008
In [ ]:
 
In [12]:
df[df.duplicated()].head()
Out[12]:
satisfaction_level last_evaluation number_project average_monthly_hours tenure work_accident left promotion_last_5years department salary
396 0.46 0.57 2 139 3 0 1 0 sales low
866 0.41 0.46 2 128 3 0 1 0 accounting low
1317 0.37 0.51 2 127 3 0 1 0 sales medium
1368 0.41 0.52 2 132 3 0 1 0 RandD low
1461 0.42 0.53 2 142 3 0 1 0 sales low
In [9]:
# Drop duplicates and save resulting dataframe in a new variable as needed
df1 = df.drop_duplicates(keep='first')

# Display first few rows of new dataframe as needed
df1.head()
Out[9]:
satisfaction_level last_evaluation number_project average_monthly_hours tenure work_accident left promotion_last_5years department salary
0 0.38 0.53 2 157 3 0 1 0 sales low
1 0.80 0.86 5 262 6 0 1 0 sales medium
2 0.11 0.88 7 272 4 0 1 0 sales medium
3 0.72 0.87 5 223 5 0 1 0 sales low
4 0.37 0.52 2 159 3 0 1 0 sales low
In [ ]:
 
In [15]:
plt.figure(figsize=(8, 6))  # Slightly larger figure for better readability
sns.set_style("whitegrid")  # Set a nice background grid style
plt.title('Boxplot of Tenure Distribution and Outliers Detection', fontsize=14)
plt.xlabel('Tenure', fontsize=12)  # Added x-axis label for clarity
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Plot the boxplot
sns.boxplot(x=df1['tenure'], color='skyblue')  # Changed color for a more aesthetic plot
plt.show()
No description has been provided for this image
In [ ]:
 
In [10]:
# Step 1: Compute the 25th and 75th percentile values of `tenure`
percentile25 = df1['tenure'].quantile(0.25)
percentile75 = df1['tenure'].quantile(0.75)

# Step 2: Compute the interquartile range (IQR)
iqr = percentile75 - percentile25

# Step 3: Define the upper and lower bounds for outliers based on IQR
upper_limit = percentile75 + 1.5 * iqr
lower_limit = percentile25 - 1.5 * iqr

# Display the calculated limits
print(f"25th percentile (Q1): {percentile25}")
print(f"75th percentile (Q3): {percentile75}")
print(f"IQR: {iqr}")
print(f"Lower limit for outliers: {lower_limit}")
print(f"Upper limit for outliers: {upper_limit}")

# Step 4: Identify outliers in `tenure`
outliers = df1[(df1['tenure'] < lower_limit) | (df1['tenure'] > upper_limit)]

# Step 5: Report the number of outliers and preview them
num_outliers = len(outliers)
print(f"Number of outliers in 'tenure': {num_outliers}")
print("Outlier data preview:")
outliers.head()  # Display first few rows of outliers
25th percentile (Q1): 3.0
75th percentile (Q3): 4.0
IQR: 1.0
Lower limit for outliers: 1.5
Upper limit for outliers: 5.5
Number of outliers in 'tenure': 824
Outlier data preview:
Out[10]:
satisfaction_level last_evaluation number_project average_monthly_hours tenure work_accident left promotion_last_5years department salary
1 0.80 0.86 5 262 6 0 1 0 sales medium
17 0.78 0.99 4 255 6 0 1 0 sales low
34 0.84 0.87 4 246 6 0 1 0 hr low
47 0.57 0.70 3 273 6 0 1 0 support low
67 0.90 0.98 4 264 6 0 1 0 product_mng medium
In [ ]:
 
In [ ]:
 
In [6]:
print(df1['left'].value_counts())
print()

# Get percentages of people who left vs. stayed
### YOUR CODE HERE ###
print(df1['left'].value_counts(normalize=True))
left
0    10000
1     1991
Name: count, dtype: int64

left
0    0.833959
1    0.166041
Name: proportion, dtype: float64
In [ ]:
 
In [20]:
# Set up figure and axes for two plots
fig, ax = plt.subplots(1, 2, figsize=(22, 8))  # Same figure size, but clarified axes assignment

# Plot 1: Boxplot for average monthly hours by number of projects, with comparison for employees who stayed vs. those who left
sns.boxplot(data=df1, x='average_monthly_hours', y='number_project', hue='left', orient='h', ax=ax[0], palette='Set2')

# Improve aesthetics and labeling
ax[0].invert_yaxis()  # Keeps projects in descending order
ax[0].set_title('Average Monthly Hours by Number of Projects (Stay vs. Left)', fontsize=16)
ax[0].set_xlabel('Average Monthly Hours', fontsize=12)
ax[0].set_ylabel('Number of Projects', fontsize=12)
ax[0].legend(title='Employee Left', loc='upper right', fontsize=12)

# Plot 2: Histogram showing distribution of number of projects for employees who stayed vs. left
sns.histplot(data=df1, x='number_project', hue='left', multiple='dodge', shrink=0.8, ax=ax[1], palette='Set1', binwidth=1)

# Improve aesthetics and labeling
ax[1].set_title('Distribution of Number of Projects (Stay vs. Left)', fontsize=16)
ax[1].set_xlabel('Number of Projects', fontsize=12)
ax[1].set_ylabel('Count', fontsize=12)

# Adjust layout and display the plots
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 
In [21]:
# Get value counts of stayed/left for employees with 7 projects
df1[df1['number_project']==7]['left'].value_counts()
Out[21]:
left
1    145
Name: count, dtype: int64
In [ ]:
 
In [23]:
# Create a scatterplot of average monthly hours vs. satisfaction level, comparing employees who stayed vs. those who left
plt.figure(figsize=(16, 9))

# Scatter plot with more vibrant colors and transparency for clarity
sns.scatterplot(data=df1, x='average_monthly_hours', y='satisfaction_level', hue='left', palette='viridis', alpha=0.6, s=100)

# Add a vertical line representing the average monthly hours (166.67)
plt.axvline(x=166.67, color='#ff6361', linestyle='--', linewidth=2, label='166.67 hrs./mo.')

# Update the legend to be more descriptive
plt.legend(title='Employee Left', labels=['Average Monthly Hours (166.67)', 'Left', 'Stayed'], fontsize=12, title_fontsize=14, loc='upper right')

# Improve titles and labels
plt.title('Average Monthly Hours vs. Satisfaction Level (Stay vs. Left)', fontsize=18, fontweight='bold')
plt.xlabel('Average Monthly Hours', fontsize=14)
plt.ylabel('Satisfaction Level', fontsize=14)

# Adjust tick label sizes for better readability
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Display the plot
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 
In [25]:
# Set the plot style and color palette with only 2 colors for the hue
sns.set_style('whitegrid')
palette = sns.color_palette("Set2", 2)  # Limit the palette to 2 colors

# Set figure and axes
fig, ax = plt.subplots(1, 2, figsize=(22, 8))

# Plot 1: Boxplot showing distributions of `satisfaction_level` by tenure, comparing employees who stayed versus those who left
sns.boxplot(data=df1, x='satisfaction_level', y='tenure', hue='left', orient="h", ax=ax[0], palette=palette)
ax[0].invert_yaxis()  # Invert y-axis to have the highest tenure at the top
ax[0].set_title('Satisfaction by Tenure', fontsize=16)
ax[0].set_xlabel('Satisfaction Level', fontsize=14)
ax[0].set_ylabel('Tenure (Years)', fontsize=14)
ax[0].tick_params(axis='both', which='major', labelsize=12)
ax[0].grid(True)

# Plot 2: Histogram showing the distribution of `tenure`, comparing employees who stayed versus those who left
sns.histplot(data=df1, x='tenure', hue='left', multiple='dodge', bins=10, palette=palette, ax=ax[1], alpha=0.8)
ax[1].set_title('Tenure Histogram', fontsize=16)
ax[1].set_xlabel('Tenure (Years)', fontsize=14)
ax[1].set_ylabel('Count', fontsize=14)
ax[1].tick_params(axis='both', which='major', labelsize=12)
ax[1].grid(True)

# Adjust layout for better readability
plt.tight_layout()

# Show the plots
plt.show()
No description has been provided for this image
In [ ]:
 
In [7]:
# Calculate mean and median satisfaction scores of employees who left and those who stayed
df1.groupby(['left'])['satisfaction_level'].agg(['mean', 'median'])
Out[7]:
mean median
left
0 0.667365 0.69
1 0.440271 0.41
In [ ]:
 
In [28]:
# Set figure and axes
fig, ax = plt.subplots(1, 2, figsize=(22, 8))

# Define short-tenured employees
tenure_short = df1[df1['tenure'] < 7]

# Define long-tenured employees
tenure_long = df1[df1['tenure'] >= 7]  # Use `>=` for better clarity and inclusive grouping

# Plot short-tenured histogram
sns.histplot(data=tenure_short, x='tenure', hue='salary', discrete=True, 
             hue_order=['low', 'medium', 'high'], multiple='dodge', shrink=0.6, 
             palette='Set2', ax=ax[0])

# Improve axis labels, title, and grid for the first plot
ax[0].set_title('Salary Distribution by Tenure: Short-Tenured Employees', fontsize=16)
ax[0].set_xlabel('Tenure (Years)', fontsize=14)
ax[0].set_ylabel('Count', fontsize=14)
ax[0].tick_params(axis='both', which='major', labelsize=12)
ax[0].grid(True)

# Plot long-tenured histogram
sns.histplot(data=tenure_long, x='tenure', hue='salary', discrete=True, 
             hue_order=['low', 'medium', 'high'], multiple='dodge', shrink=0.6, 
             palette='Set1', ax=ax[1])

# Improve axis labels, title, and grid for the second plot
ax[1].set_title('Salary Distribution by Tenure: Long-Tenured Employees', fontsize=16)
ax[1].set_xlabel('Tenure (Years)', fontsize=14)
ax[1].set_ylabel('Count', fontsize=14)
ax[1].tick_params(axis='both', which='major', labelsize=12)
ax[1].grid(True)

# Adjust layout for better spacing
plt.tight_layout()

# Display the plots
plt.show()
No description has been provided for this image
In [ ]:
 
In [30]:
# Set the figure size and axes
plt.figure(figsize=(16, 6))

# Create scatterplot to examine the relationship between average monthly hours and promotion in the last 5 years
sns.scatterplot(data=df1, x='average_monthly_hours', y='promotion_last_5years', hue='left', 
                palette='coolwarm', alpha=0.6, s=100)

# Add a vertical line representing the average monthly hours (166.67)
plt.axvline(x=166.67, color='#ff6361', linestyle='--', linewidth=2, label='166.67 hrs./mo.')

# Update the legend with more descriptive labels
plt.legend(title='Employee Status', labels=['Average Monthly Hours (166.67)', 'Left', 'Stayed'], 
           fontsize=12, title_fontsize=14, loc='upper right')

# Improve title and axis labels
plt.title('Relationship between Monthly Hours and Promotion in Last 5 Years', fontsize=18, fontweight='bold')
plt.xlabel('Average Monthly Hours', fontsize=14)
plt.ylabel('Promotion in Last 5 Years', fontsize=14)

# Adjust tick label sizes for better readability
plt.xticks(fontsize=12)
plt.yticks([0, 1], labels=['No Promotion', 'Promoted'], fontsize=12)  # Customize y-axis labels

# Display the plot with optimized layout
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 
In [31]:
# Set the plot style and color palette
sns.set_style('whitegrid')
palette = sns.color_palette("coolwarm", 2)  # Choose a palette that differentiates 'left' and 'stayed'

# Create scatterplot of `average_monthly_hours` versus `last_evaluation`
plt.figure(figsize=(16, 9))
sns.scatterplot(data=df1, x='average_monthly_hours', y='last_evaluation', hue='left', alpha=0.6, palette=palette)

# Add vertical line for reference at 166.67 hours per month
plt.axvline(x=166.67, color='#ff6361', label='166.67 hrs./mo.', ls='--', linewidth=2)

# Set plot title and labels
plt.title('Average Monthly Hours vs Last Evaluation Score', fontsize=18)
plt.xlabel('Average Monthly Hours', fontsize=14)
plt.ylabel('Last Evaluation Score', fontsize=14)

# Adjust tick parameters for better readability
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Improve legend labels
plt.legend(labels=['166.67 hrs./mo.', 'Left', 'Stayed'], title='Legend', fontsize=12, title_fontsize=14)

# Add grid for better readability
plt.grid(True)

# Show plot
plt.show()
No description has been provided for this image
In [ ]:
 
In [32]:
df1["department"].value_counts()
Out[32]:
department
sales          3239
technical      2244
support        1821
IT              976
RandD           694
product_mng     686
marketing       673
accounting      621
hr              601
management      436
Name: count, dtype: int64
In [ ]:
 
In [37]:
# Set the plot style and color palette
sns.set_style('whitegrid')

# Create a figure for the stacked histogram
plt.figure(figsize=(11, 8))

# Create a palette with exactly two colors for the 'left' hue
palette = sns.color_palette("Set2", n_colors=2)

# Create a histogram showing department-wise distribution of employees who stayed vs left
sns.histplot(data=df1, x='department', hue='left', discrete=True, 
             hue_order=[0, 1], multiple='dodge', shrink=0.8, palette=palette)

# Rotate and align x-axis labels for better readability
plt.xticks(rotation=45, ha='right', fontsize=12)

# Set plot title and axis labels
plt.title('Department-wise Distribution of Employees: Stayed vs Left', fontsize=16)
plt.xlabel('Department', fontsize=14)
plt.ylabel('Count', fontsize=14)

# Adjust tick size for better readability
plt.yticks(fontsize=12)

# Add gridlines for visual clarity
plt.grid(True, axis='y', linestyle='--')

# Improve the legend
plt.legend(title='Status', labels=['Stayed', 'Left'], fontsize=12, title_fontsize=14)

# Adjust layout for better spacing
plt.tight_layout()

# Show the plot
plt.show()
No description has been provided for this image
In [ ]:
 
In [11]:
# Copy the dataframe
df_enc = df1.copy()

# Encode the `salary` column as an ordinal numeric category (with handling of missing values, if any)
df_enc['salary'] = (
    df_enc['salary'].astype('category')
    .cat.set_categories(['low', 'medium', 'high'], ordered=True)
    .cat.codes
)

# Dummy encode the `department` column with a clear prefix for each dummy variable
df_enc = pd.get_dummies(df_enc, columns=['department'], drop_first=True, prefix='dept')

# Display the first few rows and structure of the new dataframe
df_enc.head()
Out[11]:
satisfaction_level last_evaluation number_project average_monthly_hours tenure work_accident left promotion_last_5years salary dept_RandD dept_accounting dept_hr dept_management dept_marketing dept_product_mng dept_sales dept_support dept_technical
0 0.38 0.53 2 157 3 0 1 0 0 False False False False False False True False False
1 0.80 0.86 5 262 6 0 1 0 1 False False False False False False True False False
2 0.11 0.88 7 272 4 0 1 0 1 False False False False False False True False False
3 0.72 0.87 5 223 5 0 1 0 0 False False False False False False True False False
4 0.37 0.52 2 159 3 0 1 0 0 False False False False False False True False False
In [ ]:
 
In [40]:
# Set the figure size
plt.figure(figsize=(10, 8))

# Create the heatmap for selected features with improvements
heatmap = sns.heatmap(df_enc[['satisfaction_level', 'last_evaluation', 'number_project', 
                               'average_monthly_hours', 'tenure']].corr(), 
                      annot=True, fmt=".2f", linewidths=0.5, 
                      cmap="crest", cbar_kws={'shrink': 0.75})

# Customize colorbar tick labels for better readability
colorbar = heatmap.collections[0].colorbar
colorbar.ax.tick_params(labelsize=12)

# Improve title and label aesthetics
heatmap.set_title('Correlation Heatmap of Key Variables', fontsize=18, fontweight='bold', pad=20)

# Rotate x and y tick labels for better visibility
heatmap.set_xticklabels(heatmap.get_xticklabels(), rotation=45, fontsize=12)
heatmap.set_yticklabels(heatmap.get_yticklabels(), rotation=0, fontsize=12)

# Adjust layout for better spacing and readability
plt.tight_layout()

# Show the heatmap
plt.show()
No description has been provided for this image
In [ ]:
 
In [41]:
# Create a crosstab of the counts of employees who left versus stayed by department
crosstab = pd.crosstab(df1['department'], df1['left'])

# Set the figure size
plt.figure(figsize=(12, 8))

# Plot the bar chart with improved logic and aesthetics
crosstab.plot(kind='bar', color=['purple', 'red'], edgecolor='black', alpha=0.8)

# Add titles and labels
plt.title('Employee Count by Department: Stayed vs. Left', fontsize=16, fontweight='bold')
plt.ylabel('Employee Count', fontsize=14)
plt.xlabel('Department', fontsize=14)

# Improve the legend
plt.legend(title='Employee Status', labels=['Stayed (0)', 'Left (1)'], fontsize=12, title_fontsize=14)

# Adjust tick size for better readability
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Add gridlines for clarity
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Show the plot
plt.tight_layout()
plt.show()
<Figure size 1200x800 with 0 Axes>
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [12]:
# Select rows without outliers in `tenure` and save resulting dataframe in a new variable
df_log = df_enc[(df_enc['tenure'] >= lower_limit) & (df_enc['tenure'] <= upper_limit)]

# Display first few rows of new dataframe
df_log.head()
Out[12]:
satisfaction_level last_evaluation number_project average_monthly_hours tenure work_accident left promotion_last_5years salary dept_RandD dept_accounting dept_hr dept_management dept_marketing dept_product_mng dept_sales dept_support dept_technical
0 0.38 0.53 2 157 3 0 1 0 0 False False False False False False True False False
2 0.11 0.88 7 272 4 0 1 0 1 False False False False False False True False False
3 0.72 0.87 5 223 5 0 1 0 0 False False False False False False True False False
4 0.37 0.52 2 159 3 0 1 0 0 False False False False False False True False False
5 0.41 0.50 2 153 3 0 1 0 0 False False False False False False True False False
In [ ]:
 
In [10]:
# Isolate the outcome variable
y = df_log['left']
# Display first few rows of the outcome variable
y.head() 
Out[10]:
0    1
2    1
3    1
4    1
5    1
Name: left, dtype: int64
In [ ]:
 
In [26]:
df_log['left'].value_counts()


left
0    9285
1    1882
Name: count, dtype: int64
Out[26]:
left
0    9285
1    1882
Name: count, dtype: int64
In [11]:
# Select the features you want to use in your model
X = df_log.drop('left', axis=1)

# Display the first few rows of the selected features 
X.head()
Out[11]:
satisfaction_level last_evaluation number_project average_monthly_hours tenure work_accident promotion_last_5years salary dept_RandD dept_accounting dept_hr dept_management dept_marketing dept_product_mng dept_sales dept_support dept_technical
0 0.38 0.53 2 157 3 0 0 0 False False False False False False True False False
2 0.11 0.88 7 272 4 0 0 1 False False False False False False True False False
3 0.72 0.87 5 223 5 0 0 0 False False False False False False True False False
4 0.37 0.52 2 159 3 0 0 0 False False False False False False True False False
5 0.41 0.50 2 153 3 0 0 0 False False False False False False True False False
In [13]:
 
Out[13]:
Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_monthly_hours', 'tenure', 'work_accident',
       'promotion_last_5years', 'salary', 'dept_RandD', 'dept_accounting',
       'dept_hr', 'dept_management', 'dept_marketing', 'dept_product_mng',
       'dept_sales', 'dept_support', 'dept_technical'],
      dtype='object')
In [47]:
# Split the data into training set and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)
In [ ]:
 
In [ ]:
 
In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


# Scale the features for better convergence
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Construct a logistic regression model with improved parameters
log_clf = LogisticRegression(random_state=42, max_iter=1000, solver='lbfgs', class_weight='balanced')  # Adjusted solver and class weight
log_clf.fit(X_train_scaled, y_train)

# Use the logistic regression model to get predictions on the scaled test set
y_pred = log_clf.predict(X_test_scaled)

# Compute values for confusion matrix
log_cm = confusion_matrix(y_test, y_pred, labels=log_clf.classes_)

# Create a display of the confusion matrix
log_disp = ConfusionMatrixDisplay(confusion_matrix=log_cm, 
                                  display_labels=log_clf.classes_)
In [51]:
# Create a DataFrame to compare predicted vs actual values
results_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred,
    'Correct': y_test == y_pred  # Boolean column indicating if the prediction was correct
})

# Display the first few rows of the results DataFrame
results_df.head()
Out[51]:
Actual Predicted Correct
10368 0 0 True
6408 0 0 True
6129 0 0 True
964 1 0 False
11657 0 0 True
In [50]:
# Plot the confusion matrix with enhancements
plt.figure(figsize=(10, 8))
log_disp.plot(cmap=plt.cm.Blues, values_format='d', ax=plt.gca(), colorbar=True)

# Set titles and labels for clarity
plt.title('Confusion Matrix for Logistic Regression Model', fontsize=20, fontweight='bold')
plt.xlabel('Predicted Label', fontsize=16)
plt.ylabel('True Label', fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

# Add a grid for better visual reference
plt.grid(False)

# Display metrics on the confusion matrix
for i in range(len(log_cm)):
    for j in range(len(log_cm)):
        plt.text(j, i, f'{log_cm[i, j]}', ha='center', va='center', color='white' if log_cm[i, j] > log_cm.max()/2 else 'black', fontsize=14)

# Show the plot
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 
In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix,
    roc_curve,
    auc,
    precision_recall_curve,
    average_precision_score,
)
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Create a logistic regression model and fit it
log_clf = LogisticRegression(random_state=42, max_iter=5000).fit(X_train, y_train)

# Use the model to get predictions on the test set
y_pred = log_clf.predict(X_test)
y_proba = log_clf.predict_proba(X_test)[:, 1]  # Get predicted probabilities for the positive class

# Class distribution
class_distribution = df_log['left'].value_counts(normalize=True)
print("Class Distribution:")
print(class_distribution)

# Create classification report
target_names = ['Predicted would not leave', 'Predicted would leave']
class_report = classification_report(y_test, y_pred, target_names=target_names)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the Logistic Regression Model: {accuracy:.2f}\n")

# Display classification report
print("Classification Report:")
print(class_report)

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=target_names, yticklabels=target_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
Class Distribution:
left
0    0.831468
1    0.168532
Name: proportion, dtype: float64
Accuracy of the Logistic Regression Model: 0.82

Classification Report:
                           precision    recall  f1-score   support

Predicted would not leave       0.86      0.93      0.90      2321
    Predicted would leave       0.44      0.26      0.33       471

                 accuracy                           0.82      2792
                macro avg       0.65      0.60      0.61      2792
             weighted avg       0.79      0.82      0.80      2792

No description has been provided for this image
In [55]:
# ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

# Precision-Recall Curve
precision, recall, _ = precision_recall_curve(y_test, y_proba)
average_precision = average_precision_score(y_test, y_proba)

plt.figure(figsize=(8, 6))
plt.plot(recall, precision, color='blue', lw=2, label=f'Precision-Recall Curve (AP = {average_precision:.2f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='lower left')
plt.show()
No description has been provided for this image
No description has been provided for this image
In [56]:
# Cross-validation scores
cv_scores = cross_val_score(log_clf, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validated Accuracy: {np.mean(cv_scores):.2f} +/- {np.std(cv_scores):.2f}")
Cross-Validated Accuracy: 0.83 +/- 0.00
In [57]:
# Display feature importance
importance = np.abs(log_clf.coef_[0])
feature_names = X_train.columns
feature_importance = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance)
plt.title('Feature Importance from Logistic Regression')
plt.show()
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [14]:
# import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Assume df_enc is your encoded dataframe from the previous steps
# Define the specific 7 features for training
selected_features = ['satisfaction_level', 'last_evaluation', 'number_project', 
                     'average_monthly_hours', 'tenure', 'salary', 'work_accident']

# Select only the chosen features
X = df_log[selected_features]  # Features (only the specified 7)
y = df_log['left']  # Target

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the RandomForestClassifier with specified defaults
rf = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                            class_weight= 'balanced', criterion='gini', max_depth=None,
                            min_impurity_decrease=0.0, min_samples_leaf=1,
                            min_samples_split=2, n_estimators=1000)

# Fit the model to the training data
rf.fit(X_train, y_train)
Out[14]:
RandomForestClassifier(class_weight='balanced', n_estimators=1000)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(class_weight='balanced', n_estimators=1000)
In [21]:
# Save X_train, X_test, y_train, and y_test to CSV files in the specified path
output_path = "C:/Users/HP/OneDrive/Documents/"

# Save training and test sets
X_train.to_csv(output_path + "X_train.csv", index=False)
X_test.to_csv(output_path + "X_test.csv", index=False)
y_train.to_csv(output_path + "y_train.csv", index=False)
y_test.to_csv(output_path + "y_test.csv", index=False)

print("Files have been saved successfully!")
Files have been saved successfully!
In [16]:
X.columns
Out[16]:
Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_monthly_hours', 'tenure', 'work_accident',
       'promotion_last_5years', 'salary', 'dept_RandD', 'dept_accounting',
       'dept_hr', 'dept_management', 'dept_marketing', 'dept_product_mng',
       'dept_sales', 'dept_support', 'dept_technical'],
      dtype='object')
In [16]:
# Make predictions on the test set
y_pred = rf.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Test set accuracy: {accuracy:.4f}")

# Detailed classification report
print(classification_report(y_test, y_pred))
Test set accuracy: 0.9839
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1846
           1       0.99      0.92      0.95       388

    accuracy                           0.98      2234
   macro avg       0.98      0.96      0.97      2234
weighted avg       0.98      0.98      0.98      2234

In [23]:
# Create a DataFrame to compare predicted vs actual values
results_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred,
    'Correct': y_test == y_pred  # Boolean column indicating if the prediction was correct
})

# Display the first few rows of the results DataFrame
results_df.head(10)
Out[23]:
Actual Predicted Correct
3830 0 0 True
7180 0 0 True
988 1 1 True
157 1 1 True
11854 0 0 True
7446 0 0 True
3443 0 0 True
4895 0 0 True
7057 0 0 True
1594 1 1 True
In [15]:
from sklearn.inspection import permutation_importance

# Calculate permutation importance
perm_importance = permutation_importance(rf, X_test, y_test, n_repeats=10, random_state=42)

# Create a DataFrame
perm_df = pd.DataFrame({
    'Feature': X_test.columns,
    'Importance': perm_importance.importances_mean
})

# Sort by importance and select top 7
perm_df = perm_df.sort_values(by='Importance', ascending=False).head(7)

# Plot permutation feature importance for top 7 features
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=perm_df, hue='Feature', dodge=False, palette='magma', legend=False)

# Set title and labels
plt.title('Top 7 Permutation Feature Importance', fontsize=16)
plt.xlabel('Mean Decrease in Accuracy', fontsize=14)
plt.ylabel('Feature', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Adjust layout and show plot
plt.tight_layout()
plt.show()
No description has been provided for this image
In [27]:
df_log.info()
<class 'pandas.core.frame.DataFrame'>
Index: 11167 entries, 0 to 11999
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     11167 non-null  float64
 1   last_evaluation        11167 non-null  float64
 2   number_project         11167 non-null  int64  
 3   average_monthly_hours  11167 non-null  int64  
 4   tenure                 11167 non-null  int64  
 5   work_accident          11167 non-null  int64  
 6   left                   11167 non-null  int64  
 7   promotion_last_5years  11167 non-null  int64  
 8   salary                 11167 non-null  int8   
 9   dept_RandD             11167 non-null  bool   
 10  dept_accounting        11167 non-null  bool   
 11  dept_hr                11167 non-null  bool   
 12  dept_management        11167 non-null  bool   
 13  dept_marketing         11167 non-null  bool   
 14  dept_product_mng       11167 non-null  bool   
 15  dept_sales             11167 non-null  bool   
 16  dept_support           11167 non-null  bool   
 17  dept_technical         11167 non-null  bool   
dtypes: bool(9), float64(2), int64(6), int8(1)
memory usage: 894.2 KB
In [ ]:
 
In [20]:
import joblib

# Save the model
joblib.dump(rf, 'random_forest_model.joblib')

# Load the model later
loaded_model = joblib.load('random_forest_model.joblib')
In [18]:
import joblib
# Load the model later
loaded_model = joblib.load('random_forest_model')
In [ ]:
# Function to take user input
def get_user_input():
    print("Please enter the following details about the employee:")
    
    satisfaction_level = float(input("Satisfaction level (0-1): "))
    last_evaluation = float(input("Last evaluation (0-1): "))
    number_project = int(input("Number of projects: "))
    average_monthly_hours = int(input("Average monthly hours: "))
    tenure = int(input("Tenure (in years): "))
    work_accident = int(input("Had work accident? (0 = No, 1 = Yes): "))
    salary = int(input("Salary level (0 = low, 1 = medium, 2 = high): "))
    
    # Create the input data in the same format as training data
    input_data = np.array([[satisfaction_level, last_evaluation, number_project,
                            average_monthly_hours, tenure, work_accident, salary]])
    return input_data

# Get user input
new_data = get_user_input()

# Make a prediction
prediction = loaded_model.predict(new_data)

# Output the prediction
if prediction[0] == 1:
    print("Prediction: The employee is likely to leave.")
else:
    print("Prediction: The employee is likely to stay.")
    
    
Please enter the following details about the employee:
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: